{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# [ Chapter 6 - Using Content to Learn Domain-specific Language ]\n",
    "# Query Classification and Disambiguation with Semantic Knowledge Graphs"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "tags": []
   },
   "source": [
    "NOTE: This notebook depends upon the Stack Exchange datasets. If you have any issues, please rerun the [Setting up the Stack Exchange Dataset](../ch05/2.index-datasets.ipynb) notebook."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "\n",
    "from pyspark.sql import SparkSession\n",
    "\n",
    "from aips import get_engine\n",
    "from aips import get_semantic_knowledge_graph as get_skg\n",
    "\n",
    "# Obtain the Spark session for this notebook under the \"AIPS\" app name.\n",
    "spark = (\n",
    "    SparkSession.builder\n",
    "    .appName(\"AIPS\")\n",
    "    .getOrCreate()\n",
    ")\n",
    "\n",
    "# Engine handle; later cells use it to look up the collection they query.\n",
    "engine = get_engine()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Query Classification"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Listing 6.1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "def print_classifications(query, traversal):\n",
    "    \"\"\"Print the classifications found for `query` with their relatedness scores.\n",
    "\n",
    "    `traversal` is indexed as a nested mapping: the classifications are the\n",
    "    values of the first nested traversal under the query's own node.\n",
    "    \"\"\"\n",
    "    query_node = traversal[\"graph\"][0][\"values\"][query]\n",
    "    scored_terms = query_node[\"traversals\"][0][\"values\"]\n",
    "\n",
    "    print(f\"Query: {query}\")\n",
    "    print(\"  Classifications:\")\n",
    "    for label, details in scored_terms.items():\n",
    "        score = details[\"relatedness\"]\n",
    "        print(f\"    {label}  {score}\")\n",
    "    print()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Query: docker\n",
      "  Classifications:\n",
      "    devops  0.87978\n",
      "\n",
      "Query: airplane\n",
      "  Classifications:\n",
      "    travel  0.33334\n",
      "\n",
      "Query: airplane AND crash\n",
      "  Classifications:\n",
      "    scifi  0.02149\n",
      "    travel  0.00475\n",
      "\n",
      "Query: vitamins\n",
      "  Classifications:\n",
      "    health  0.48681\n",
      "    cooking  0.09441\n",
      "\n",
      "Query: alien\n",
      "  Classifications:\n",
      "    scifi  0.62541\n",
      "\n",
      "Query: passport\n",
      "  Classifications:\n",
      "    travel  0.82883\n",
      "\n",
      "Query: driver\n",
      "  Classifications:\n",
      "    travel  0.38996\n",
      "    devops  0.08917\n",
      "\n",
      "Query: driver AND taxi\n",
      "  Classifications:\n",
      "    travel  0.24184\n",
      "    scifi  -0.13757\n",
      "\n",
      "Query: driver AND install\n",
      "  Classifications:\n",
      "    devops  0.22277\n",
      "    travel  -0.00675\n",
      "\n"
     ]
    }
   ],
   "source": [
    "def print_query_classification(query,\n",
    "                               classification_field=\"category\",\n",
    "                               classification_limit=5,\n",
    "                               keywords_field=\"body\",\n",
    "                               min_occurrences=5):\n",
    "    \"\"\"Traverse from `query` to its classifications and print them.\n",
    "\n",
    "    Two nodes are sent to the notebook-level `skg` object: one holding\n",
    "    `query` as a value of `keywords_field`, and one on `classification_field`\n",
    "    carrying `min_occurrences` and `classification_limit` as its limit.\n",
    "    The response is handed to `print_classifications`.\n",
    "    \"\"\"\n",
    "    query_node = {\"field\": keywords_field,\n",
    "                  \"values\": [query]}\n",
    "    classification_node = {\"field\": classification_field,\n",
    "                           \"min_occurrences\": min_occurrences,\n",
    "                           \"limit\": classification_limit}\n",
    "\n",
    "    traversal = skg.traverse(query_node, classification_node)\n",
    "    print_classifications(query, traversal)\n",
    "\n",
    "skg = get_skg(engine.get_collection(\"stackexchange\"))\n",
    "\n",
    "# (query, classification_limit) pairs, classified in the order listed.\n",
    "classification_examples = [\n",
    "    (\"docker\", 3),\n",
    "    (\"airplane\", 1),\n",
    "    (\"airplane AND crash\", 2),\n",
    "    (\"vitamins\", 2),\n",
    "    (\"alien\", 1),\n",
    "    (\"passport\", 1),\n",
    "    (\"driver\", 2),\n",
    "    (\"driver AND taxi\", 2),\n",
    "    (\"driver AND install\", 2),\n",
    "]\n",
    "for example_query, example_limit in classification_examples:\n",
    "    print_query_classification(example_query, classification_limit=example_limit)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Disambiguation"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Listing 6.2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "def print_disambigutaions(query, traversal):\n",
    "    \"\"\"Print every context found for `query`, each followed by its keywords.\n",
    "\n",
    "    Contexts are the values of the first nested traversal under the query's\n",
    "    node; each context's keywords are the values of that context's own first\n",
    "    nested traversal. The relatedness score is printed beside every entry.\n",
    "    \"\"\"\n",
    "    query_node = traversal[\"graph\"][0][\"values\"][query]\n",
    "    contexts = query_node[\"traversals\"][0][\"values\"]\n",
    "\n",
    "    print(f\"Query: {query}\")\n",
    "    for context_name, context_info in contexts.items():\n",
    "        context_score = context_info[\"relatedness\"]\n",
    "        print(f\"  Context: {context_name}  {context_score}\")\n",
    "        print(\"    Keywords: \")\n",
    "        related_keywords = context_info[\"traversals\"][0][\"values\"]\n",
    "        for keyword_name, keyword_info in related_keywords.items():\n",
    "            keyword_score = keyword_info[\"relatedness\"]\n",
    "            print(f\"      {keyword_name}  {keyword_score}\")\n",
    "        print()\n",
    "\n",
    "def print_query_disambigutaion(query,\n",
    "                               context_field=\"category\",\n",
    "                               context_limit=5,\n",
    "                               keywords_field=\"body\",\n",
    "                               keywords_limit=10,\n",
    "                               min_occurrences=5):\n",
    "    \"\"\"Traverse query -> contexts -> keywords and print the result.\n",
    "\n",
    "    Three nodes are sent to the notebook-level `skg` object: `query` as a\n",
    "    value of `keywords_field`, then `context_field` limited to\n",
    "    `context_limit`, then `keywords_field` again limited to `keywords_limit`.\n",
    "    The last two nodes both carry `min_occurrences`. The response is handed\n",
    "    to `print_disambigutaions`.\n",
    "    \"\"\"\n",
    "    query_node = {\"field\": keywords_field,\n",
    "                  \"values\": [query]}\n",
    "    context_node = {\"field\": context_field,\n",
    "                    \"min_occurrences\": min_occurrences,\n",
    "                    \"limit\": context_limit}\n",
    "    keywords_node = {\"field\": keywords_field,\n",
    "                     \"min_occurrences\": min_occurrences,\n",
    "                     \"limit\": keywords_limit}\n",
    "\n",
    "    traversal = skg.traverse(query_node, context_node, keywords_node)\n",
    "    print_disambigutaions(query, traversal)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Listing 6.3"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Query: server\n",
      "  Context: devops  0.83796\n",
      "    Keywords: \n",
      "      server  0.93698\n",
      "      servers  0.76818\n",
      "      docker  0.75955\n",
      "      code  0.72832\n",
      "      configuration  0.70686\n",
      "      deploy  0.70634\n",
      "      nginx  0.70366\n",
      "      jenkins  0.69934\n",
      "      git  0.68932\n",
      "      ssh  0.6836\n",
      "\n",
      "  Context: cooking  -0.1574\n",
      "    Keywords: \n",
      "      server  0.66363\n",
      "      restaurant  0.16482\n",
      "      pie  0.12882\n",
      "      served  0.12098\n",
      "      restaurants  0.11679\n",
      "      knife  0.10788\n",
      "      pieces  0.10135\n",
      "      serve  0.08934\n",
      "      staff  0.0886\n",
      "      dish  0.08553\n",
      "\n",
      "  Context: travel  -0.15959\n",
      "    Keywords: \n",
      "      server  0.81226\n",
      "      tipping  0.54391\n",
      "      vpn  0.45352\n",
      "      tip  0.41117\n",
      "      servers  0.39053\n",
      "      firewall  0.33092\n",
      "      restaurant  0.21698\n",
      "      tips  0.19524\n",
      "      bill  0.18951\n",
      "      cash  0.18485\n",
      "\n",
      "  Context: scifi  -0.28208\n",
      "    Keywords: \n",
      "      server  0.78173\n",
      "      flynn's  0.53341\n",
      "      computer  0.28075\n",
      "      computers  0.2593\n",
      "      flynn  0.24963\n",
      "      servers  0.24778\n",
      "      grid  0.23889\n",
      "      networking  0.2178\n",
      "      shutdown  0.21121\n",
      "      hacker  0.19444\n",
      "\n",
      "Query: driver\n",
      "  Context: travel  0.38996\n",
      "    Keywords: \n",
      "      driver  0.93417\n",
      "      drivers  0.76932\n",
      "      taxi  0.71977\n",
      "      car  0.65572\n",
      "      license  0.61319\n",
      "      driving  0.60849\n",
      "      taxis  0.57708\n",
      "      traffic  0.52823\n",
      "      bus  0.52306\n",
      "      driver's  0.51043\n",
      "\n",
      "  Context: devops  0.08917\n",
      "    Keywords: \n",
      "      ipam  0.78219\n",
      "      driver  0.77583\n",
      "      aufs  0.73758\n",
      "      overlayfs  0.73758\n",
      "      container_name  0.73483\n",
      "      overlay2  0.69079\n",
      "      cgroup  0.68438\n",
      "      docker  0.67529\n",
      "      compose.yml  0.65012\n",
      "      compose  0.55631\n",
      "\n",
      "Query: chef\n",
      "  Context: cooking  0.37731\n",
      "    Keywords: \n",
      "      chef  0.93239\n",
      "      chefs  0.5151\n",
      "      www.pamperedchef.com  0.41292\n",
      "      kitchen  0.39127\n",
      "      restaurant  0.38975\n",
      "      cooking  0.38332\n",
      "      chef's  0.37392\n",
      "      professional  0.36688\n",
      "      nakiri  0.36599\n",
      "      pampered  0.34736\n",
      "\n",
      "  Context: devops  0.34959\n",
      "    Keywords: \n",
      "      chef  0.87653\n",
      "      puppet  0.79142\n",
      "      docs.chef.io  0.7865\n",
      "      ansible  0.73888\n",
      "      www.chef.io  0.72073\n",
      "      learn.chef.io  0.71902\n",
      "      default.rb  0.70194\n",
      "      configuration  0.68296\n",
      "      inspec  0.65237\n",
      "      cookbooks  0.61503\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# Each entry pairs a query with the keyword overrides for its call.\n",
    "disambiguation_examples = [\n",
    "    (\"server\", {}),\n",
    "    (\"driver\", {\"context_limit\": 2}),\n",
    "    (\"chef\", {\"context_limit\": 2}),\n",
    "]\n",
    "for example_query, overrides in disambiguation_examples:\n",
    "    print_query_disambigutaion(example_query, **overrides)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Listing 6.4\n",
    "#### An SKG disambiguation request for the query `chef`"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "def print_disambigutaion_request(query,\n",
    "                                 context_field=\"category\",\n",
    "                                 context_limit=5,\n",
    "                                 keywords_field=\"body\",\n",
    "                                 keywords_limit=10,\n",
    "                                 min_occurrences=5):\n",
    "    \"\"\"Print, as indented JSON, what `skg.transform_request` returns for `query`.\n",
    "\n",
    "    The nodes passed in are: `query` as a value of `keywords_field`, then\n",
    "    `context_field` limited to `context_limit`, then `keywords_field` limited\n",
    "    to `keywords_limit`. The last two nodes both carry `min_occurrences`.\n",
    "    \"\"\"\n",
    "    query_node = {\"field\": keywords_field, \"values\": [query]}\n",
    "    context_node = {\"field\": context_field,\n",
    "                    \"min_occurrences\": min_occurrences,\n",
    "                    \"limit\": context_limit}\n",
    "    keywords_node = {\"field\": keywords_field,\n",
    "                     \"min_occurrences\": min_occurrences,\n",
    "                     \"limit\": keywords_limit}\n",
    "\n",
    "    request = skg.transform_request(query_node, context_node, keywords_node)\n",
    "    print(json.dumps(request, indent=2))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{\n",
      "  \"limit\": 0,\n",
      "  \"params\": {\n",
      "    \"q\": \"*:*\",\n",
      "    \"fore\": \"{!${defType} v=$q}\",\n",
      "    \"back\": \"*:*\",\n",
      "    \"defType\": \"edismax\",\n",
      "    \"f0_0_query\": \"chef\"\n",
      "  },\n",
      "  \"facet\": {\n",
      "    \"f0_0\": {\n",
      "      \"type\": \"query\",\n",
      "      \"sort\": {\n",
      "        \"relatedness\": \"desc\"\n",
      "      },\n",
      "      \"facet\": {\n",
      "        \"relatedness\": {\n",
      "          \"type\": \"func\",\n",
      "          \"func\": \"relatedness($fore,$back)\"\n",
      "        },\n",
      "        \"f1_0\": {\n",
      "          \"type\": \"terms\",\n",
      "          \"limit\": 2,\n",
      "          \"sort\": {\n",
      "            \"relatedness\": \"desc\"\n",
      "          },\n",
      "          \"facet\": {\n",
      "            \"relatedness\": {\n",
      "              \"type\": \"func\",\n",
      "              \"func\": \"relatedness($fore,$back)\"\n",
      "            },\n",
      "            \"f2_0\": {\n",
      "              \"type\": \"terms\",\n",
      "              \"limit\": 10,\n",
      "              \"sort\": {\n",
      "                \"relatedness\": \"desc\"\n",
      "              },\n",
      "              \"facet\": {\n",
      "                \"relatedness\": {\n",
      "                  \"type\": \"func\",\n",
      "                  \"func\": \"relatedness($fore,$back)\"\n",
      "                }\n",
      "              },\n",
      "              \"mincount\": 5,\n",
      "              \"field\": \"body\"\n",
      "            }\n",
      "          },\n",
      "          \"mincount\": 5,\n",
      "          \"field\": \"category\"\n",
      "        }\n",
      "      },\n",
      "      \"field\": \"body\",\n",
      "      \"query\": \"{!edismax q.op=AND qf=body v=$f0_0_query}\"\n",
      "    }\n",
      "  }\n",
      "}\n"
     ]
    }
   ],
   "source": [
    "# Show the request generated for the \"chef\" query with two contexts.\n",
    "print_disambigutaion_request(query=\"chef\", context_limit=2)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Success!\n",
    "\n",
    "You've leveraged a semantic knowledge graph to find related terms for a query, performed query expansion based upon semantically-similar terms, explored multiple different ways to impact precision and recall of queries through integrating semantically-augmented queries, generated content-based recommendations leveraging a semantic knowledge graph, explored arbitrary relationship types by traversing a semantic knowledge graph, and performed both query classification and query disambiguation using a semantic knowledge graph.\n",
    "\n",
    "Semantic knowledge graphs can be a powerful tool for understanding user intent and interpreting both queries and content based upon meaning instead of just text keywords.\n",
    "\n",
    "Up next: [Related Keyword Detection from Signals](../ch06/2.related-keywords-from-signals.ipynb)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.9"
  },
  "vscode": {
   "interpreter": {
    "hash": "31f2aee4e71d21fbe5cf8b01ff0e069b9275f58929596ceb00d14d90e3e16cd6"
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
